R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

#install.packages("tidytext")
library(gutenbergr)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidytext)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ readr   2.1.3
## ✔ tibble  3.1.8     ✔ purrr   0.3.5
## ✔ tidyr   1.2.1     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Copy the packaged metadata table into the workspace so that later chunks
# can refer to it as a regular object.
gutenberg_metadata <- gutenbergr::gutenberg_metadata

Q1: Find the Gutenberg IDs of Treasure Island and Kidnapped by Robert Louis Stevenson using the gutenberg_metadata data frame available in the gutenbergr package.

# Q1: look up the Project Gutenberg IDs of the two Stevenson novels.
# Every entry whose title contains the search string is kept, so each lookup
# can return several IDs.
author_pattern <- "Stevenson, Robert Louis"

# Columns are referenced by bare name inside filter(); indexing the source
# data frame with `$` only lines up while the piped data is unaltered.
treasure_island <- gutenberg_metadata %>%
  filter(
    str_detect(title, "Treasure Island"),
    str_detect(author, author_pattern)
  )

kidnapped <- gutenberg_metadata %>%
  filter(
    str_detect(title, "Kidnapped"),
    str_detect(author, author_pattern)
  )

treasure_island_id <- treasure_island$gutenberg_id
kidnapped_id <- kidnapped$gutenberg_id
#Gutenberg IDs of Treasure Island and Kidnapped
cat("Gutenberg IDs of Treasure Island: ", treasure_island_id, "\n")
## Gutenberg IDs of Treasure Island:  120 23936 27780
cat("Gutenberg IDs of Kidnapped: ", kidnapped_id, "\n")
## Gutenberg IDs of Kidnapped:  421 56562

## Q2: Download the texts of these two books using the gutenbergr package.

#Download Treasure Island
# Keep only the first matching work with slice_head(): `[1]` on a data frame
# selects the first *column*, not the first row, so it never restricted the
# result to a single book.
treasure_island_book <- gutenberg_works(
  title == "Treasure Island",
  author == "Stevenson, Robert Louis"
) %>%
  slice_head(n = 1)
treasure_island_book_text <- gutenberg_download(treasure_island_book$gutenberg_id)
## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
#Download Kidnapped
# Same first-row selection as for Treasure Island above.
kidnapped_book <- gutenberg_works(
  title == "Kidnapped",
  author == "Stevenson, Robert Louis"
) %>%
  slice_head(n = 1)
kidnapped_book_text <- gutenberg_download(kidnapped_book$gutenberg_id)

## Q3: Find the 10 most common words (that are not stop words) in each novel.

# Split each book's text into one word per row
treasure_island_words <- unnest_tokens(treasure_island_book_text, word, text)
kidnapped_words <- unnest_tokens(kidnapped_book_text, word, text)
# Drop stop words, then tally the remaining words from most to least frequent
treasure_island_filtered_wordcount <- count(
  anti_join(treasure_island_words, stop_words),
  word,
  sort = TRUE
)
## Joining, by = "word"
kidnapped_filtered_wordcount <- count(
  anti_join(kidnapped_words, stop_words),
  word,
  sort = TRUE
)
## Joining, by = "word"
# Keep the ten most frequent words of each book
top10_treasure_island <- head(treasure_island_filtered_wordcount, 10)
top10_kidnapped <- head(kidnapped_filtered_wordcount, 10)

cat("TOP 10 words in Treasure Island")
## TOP 10 words in Treasure Island
top10_treasure_island[["word"]]
##  [1] "captain" "silver"  "doctor"  "time"    "hand"    "sea"     "hands"  
##  [8] "i’ll"    "cried"   "sir"
cat("TOP 10 words in Kidnapped")
## TOP 10 words in Kidnapped
top10_kidnapped[["word"]]
##  [1] "alan"    "ye"      "house"   "time"    "set"     "hand"    "sir"    
##  [8] "cried"   "day"     "country"
# Combine the 50 most frequent non-stop words of each book into a single
# data frame. Only the top 50 are used to keep the plot readable; the full
# <book>_filtered_wordcount tables could be used in exactly the same way.
top50_treasure_island <- treasure_island_filtered_wordcount %>% head(50)
top50_kidnapped <- kidnapped_filtered_wordcount %>% head(50)

word_counts <- bind_rows(
  mutate(top50_treasure_island, book = "Treasure Island"),
  mutate(top50_kidnapped, book = "Kidnapped")
)

# Calculate each word's share of its book's total. The total is taken over
# the rows kept above, i.e. over the top-50 counts of that book.
book_word_proportions <- word_counts %>%
  group_by(book) %>%
  mutate(
    total_words = sum(n),
    proportion = n / total_words
  ) %>%
  ungroup()

# Create scatterplot of word proportions, one point per (word, book) pair
proportion_plot <- ggplot(
  book_word_proportions,
  aes(x = proportion, y = reorder(word, -proportion), color = book)
) +
  geom_point() +
  #scale_x_continuous(labels = scales::percent_format()) +
  labs(x = "Proportion of Non-Stop Words", y = "Word", color = "Book")

# Pass the plot explicitly instead of relying on ggplotly()'s default of
# converting whichever plot happened to be created last.
ggplotly(proportion_plot)
# Cleaning up to retain only words and removing stop words
frequency <- bind_rows(
  mutate(treasure_island_words, book = "Treasure Island"),
  mutate(kidnapped_words, book = "Kidnapped")
) %>%
  # keep only the first run of lowercase letters found in each token
  mutate(word = str_extract(word, "[a-z]+")) %>%
  anti_join(stop_words)
## Joining, by = "word"
# Calculate each word's share of all remaining words within its book
frequency <- frequency %>%
  count(book, word) %>%
  group_by(book) %>%
  mutate(proportion = n / sum(n)) %>%
  # drop the grouping once the per-book totals have been used
  ungroup() %>%
  select(-n)
# Spread the two books into separate columns for comparison; a word that
# does not occur in a book gets NA in that book's column
frequency <- frequency %>%
  pivot_wider(names_from = "book", values_from = "proportion")

head(frequency,5)
## # A tibble: 5 × 3
##   word      Kidnapped `Treasure Island`
##   <chr>         <dbl>             <dbl>
## 1 ab        0.0000401        NA        
## 2 aback     0.0000401         0.000134 
## 3 abandoned 0.0000401         0.0000896
## 4 abashed   0.000120         NA        
## 5 abate     0.0000401        NA

Plotting the proportions of words used in Treasure Island and Kidnapped:

# Compare word proportions between the two books on log-log axes. The dashed
# red diagonal marks words used in equal proportion in both books.
cor_plot <- ggplot(frequency, aes(x = `Treasure Island`, y = `Kidnapped`)) +
  geom_abline(color = "red", lty = 2, lwd = 2) +
  geom_point(color = "blue") +
  geom_text(aes(label = word), check_overlap = TRUE) +
  scale_x_log10() +
  scale_y_log10()

# Convert the named plot explicitly rather than ggplotly()'s implicit
# "last plot" default.
ggplotly(cor_plot)

Correlation between the word proportions in both novels:

# Correlation of word proportions, using only words present in both books.
# Missing values are tested with is.na(): `x == "NA"` compares against the
# string "NA" and only removed those rows because filter() discards NA results.
frequency %>%
  filter(!is.na(`Treasure Island`), !is.na(Kidnapped)) %>%
  # select the two proportion columns by name rather than by position
  select(Kidnapped, `Treasure Island`) %>%
  cor()
##                 Kidnapped Treasure Island
## Kidnapped       1.0000000       0.4429402
## Treasure Island 0.4429402       1.0000000

## Find two words that appear with a high frequency in Kidnapped but not in Treasure Island.

Answer: The words “alan” and “ye” are two of the many that appear with high frequency in Kidnapped but not in Treasure Island.

## Find two words that appear with a high frequency in Treasure Island but not in Kidnapped.

Answer: The words “tom” and “cap” are two of the many that appear with high frequency in Treasure Island but not in Kidnapped.

## Find two words that appear with high frequency in both novels.

Answer: The words “time” and “cried” appear with high frequency in both novels.

Q5:

Find the 10 most common bigrams in Treasure Island that do not include stop words.

# Count the bigrams of Treasure Island in which neither word is a stop word
# and keep the ten most frequent ones.
common_bigrams <- treasure_island_book_text %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # drop missing bigrams; is.na() is the correct test, comparing against the
  # string "NA" only worked because filter() discards NA results
  filter(!is.na(bigram)) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(
    !word1 %in% stop_words$word,
    !word2 %in% stop_words$word
  ) %>%
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE) %>%
  head(10)

common_bigrams
## # A tibble: 10 × 2
##    bigram               n
##    <chr>            <int>
##  1 dr livesey          38
##  2 ben gunn            31
##  3 captain smollett    29
##  4 spy glass           24
##  5 black dog           19
##  6 block house         17
##  7 admiral benbow      15
##  8 cried silver        15
##  9 john silver         15
## 10 log house           14

The 10 most common bigrams in Treasure Island that do not include stop words are :

“dr livesey”, “ben gunn”, “captain smollett”, “spy glass”, “black dog”, “block house”, “admiral benbow”, “cried silver”, “john silver”, “log house”

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.